package au.com.acpfg.align.local; import jaligner.Alignment; import jaligner.Sequence; import jaligner.SmithWatermanGotoh; import jaligner.matrix.Matrix; import jaligner.matrix.MatrixLoader; import jaligner.matrix.MatrixLoaderException; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.StringReader; import java.net.URL; import java.util.*; import java.util.jar.JarInputStream; import java.util.zip.ZipEntry; import neobio.alignment.CrochemoreLandauZivUkelsonGlobalAlignment; import neobio.alignment.CrochemoreLandauZivUkelsonLocalAlignment; import neobio.alignment.IncompatibleScoringSchemeException; import neobio.alignment.InvalidScoringMatrixException; import neobio.alignment.InvalidSequenceException; import neobio.alignment.NeedlemanWunsch; import neobio.alignment.PairwiseAlignment; import neobio.alignment.PairwiseAlignmentAlgorithm; import neobio.alignment.ScoringMatrix; import neobio.alignment.SmithWaterman; import org.knime.core.data.*; import org.knime.core.data.collection.ListCell; import org.knime.core.data.collection.SetCell; import org.knime.core.data.container.CloseableRowIterator; import org.knime.core.data.def.*; import org.knime.core.node.BufferedDataContainer; import org.knime.core.node.BufferedDataTable; import org.knime.core.node.CanceledExecutionException; import org.knime.core.node.defaultnodesettings.*; import org.knime.core.node.ExecutionContext; import org.knime.core.node.ExecutionMonitor; import org.knime.core.node.InvalidSettingsException; import org.knime.core.node.NodeLogger; import org.knime.core.node.NodeModel; import org.knime.core.node.NodeSettingsRO; import org.knime.core.node.NodeSettingsWO; /** * This is the model implementation of SequenceAligner. * Performs an alignment, performed by http://jaligner.sourceforge.net of two sequences using the chosen parameters * * @author Andrew Cassin */ public class SequenceAlignerNodeModel extends NodeModel { // the logger instance private static final NodeLogger logger = NodeLogger .getLogger(SequenceAlignerNodeModel.class); final static String CFG_ALIGN_TYPE = "alignment-type"; final static String CFG_BUILTIN_MATRIX = "builtin-matrix"; final static String CFG_GAP_PENALTY_OPEN = "gap-penalty-open"; final static String CFG_GAP_PENALTY_EXTEND = "gap-penalty-extend"; final static String CFG_ACCSN_COL = "accession-column"; final static String CFG_SEQ_COL = "sequence-column"; final static String CFG_SEQ2_COL = "sequence2-column"; final static String CFG_IS_PAIRWISE = "pairwise?"; final static String CFG_WANTED = "wanted-output-columns"; final static String[] DEF_ALIGNMENT_FORMATS= new String[] {"FASTA", "CLUSTALW", "BLAST" }; private SettingsModelString m_align_type; private SettingsModelDouble m_gap_penalty_open; private SettingsModelDouble m_gap_penalty_extend; private SettingsModelColumnName m_accsn_col; private SettingsModelString m_seq_col; private SettingsModelString m_builtin_matrix; private SettingsModelString m_is_pairwise; private SettingsModelString m_seq2_col; private SettingsModelStringArray m_wanted; private Matrix m_jalign_matrix; // both scoring matrices are computed, even if only one is used during execute() private ScoringMatrix m_neobio_matrix; /** * Constructor for the node model. */ protected SequenceAlignerNodeModel() { // one incoming, one outgoing port super(1, 1); m_align_type = (SettingsModelString) make(CFG_ALIGN_TYPE); m_gap_penalty_open = (SettingsModelDouble) make(CFG_GAP_PENALTY_OPEN); m_gap_penalty_extend = (SettingsModelDouble) make(CFG_GAP_PENALTY_EXTEND); m_accsn_col = (SettingsModelColumnName) make(CFG_ACCSN_COL); m_seq_col = (SettingsModelColumnName) make(CFG_SEQ_COL); m_builtin_matrix = (SettingsModelString) make(CFG_BUILTIN_MATRIX); m_is_pairwise = (SettingsModelString) make(CFG_IS_PAIRWISE); m_seq2_col = (SettingsModelColumnName) make(CFG_SEQ2_COL); m_seq2_col.setEnabled(!m_is_pairwise.isEnabled()); m_wanted = (SettingsModelStringArray) make(CFG_WANTED); m_jalign_matrix = null; m_neobio_matrix = null; } public static SettingsModel make(String field_name) { if (field_name.equals(CFG_ALIGN_TYPE)) { return new SettingsModelString(CFG_ALIGN_TYPE, "local"); } else if (field_name.equals(CFG_BUILTIN_MATRIX)) { return new SettingsModelString(CFG_BUILTIN_MATRIX, "PAM250"); } else if (field_name.equals(CFG_GAP_PENALTY_OPEN)) { return new SettingsModelDoubleBounded(CFG_GAP_PENALTY_OPEN, 10.0, 0, 1000); } else if (field_name.equals(CFG_GAP_PENALTY_EXTEND)) { return new SettingsModelDoubleBounded(CFG_GAP_PENALTY_EXTEND, 2.0, 0, 1000); } else if (field_name.equals(CFG_ACCSN_COL)) { return new SettingsModelColumnName(CFG_ACCSN_COL, ""); } else if (field_name.equals(CFG_SEQ_COL)) { return new SettingsModelColumnName(CFG_SEQ_COL, ""); } else if (field_name.equals(CFG_IS_PAIRWISE)) { return new SettingsModelString(CFG_IS_PAIRWISE, "1col"); } else if (field_name.equals(CFG_SEQ2_COL)) { return new SettingsModelColumnName(CFG_SEQ2_COL, ""); } else if (field_name.equals(CFG_WANTED)) { return new SettingsModelStringArray(CFG_WANTED, new String[] { "Accessions", "Original Sequences", "Score" }); } return null; } /** * {@inheritDoc} */ @Override protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception { int accsn_col = inData[0].getDataTableSpec().findColumnIndex(m_accsn_col.getColumnName()); int sequence_col = inData[0].getDataTableSpec().findColumnIndex(m_seq_col.getStringValue()); boolean is_pairwise = m_is_pairwise.getStringValue().equals("1col"); boolean use_rid = m_accsn_col.useRowID(); if ((!is_pairwise && sequence_col < 0) || (is_pairwise && ((!use_rid && accsn_col < 0) || sequence_col < 0))) { throw new Exception("Cannot locate column: have you configured the node correctly?"); } // log summary of node execution double gap_open_penalty = m_gap_penalty_open.getDoubleValue(); double gap_extend_penalty = m_gap_penalty_extend.getDoubleValue(); logger.info(m_is_pairwise.getStringValue()); int n_rows = inData[0].getRowCount(); int n_align = is_pairwise ? (n_rows * n_rows - n_rows) : n_rows; logger.info("Sequence Alignment... beginning execution"); logger.info("Alignment type: " + m_align_type.getStringValue()); logger.info("Number of alignments to be performed: "+ n_align); logger.info("Scoring matrix used: "+m_builtin_matrix.getStringValue()); if (m_align_type.getStringValue().toLowerCase().contains("jalign")) { // only specified for JAligner for now... logger.info("Gap open penalty: " + gap_open_penalty); logger.info("Gap extend penalty: " + gap_extend_penalty); } logger.info("Pairwise? "+is_pairwise); // compute the matrix for both neobio and jaligner (even though only one of them will be used, FOR NOW) try { m_jalign_matrix = MatrixLoader.load(m_builtin_matrix.getStringValue()); String mat = getMatrix(m_builtin_matrix.getStringValue()); m_neobio_matrix = new ScoringMatrix(new StringReader(mat)); } catch (MatrixLoaderException mle) { throw new Exception("Aborting! Cannot load matrix from: "+m_builtin_matrix.getStringValue()); } // execute based on supplied columns... if (is_pairwise) { return execute_pairwise(inData, exec, gap_open_penalty, gap_extend_penalty, n_rows, accsn_col, sequence_col); } else { return execute_pairs(inData, exec, gap_open_penalty, gap_extend_penalty, n_rows, accsn_col, sequence_col); } } protected BufferedDataTable[] execute_pairwise(final BufferedDataTable[] inData, final ExecutionContext exec, double gap_open_penalty, double gap_extend_penalty, int n_rows, int accsn_col, int sequence_col) throws Exception { DataTableSpec outputSpec = AlignmentReporter.getTableSpec(m_wanted.getStringArrayValue()); BufferedDataContainer container = exec.createDataContainer(outputSpec); Hashtable<String,String> ht = new Hashtable<String,String>(); RowIterator rows = inData[0].iterator(); Vector<String> accsns = new Vector<String>(); while (rows.hasNext()) { DataRow r = rows.next(); String accsn; if (accsn_col > 0) { // using column or <RowID>? accsn = r.getCell(accsn_col).toString(); } else { accsn = r.getKey().getString(); } String seq = r.getCell(sequence_col).toString(); ht.put(accsn, seq); accsns.add(accsn); } int done = 0; int align_id= 1; for (int i=0; i<n_rows; i++) { for (int j=0; j<n_rows; j++) { if (i != j) { String a1 = accsns.get(i); String a2 = accsns.get(j); String str1 = ((String) ht.get(a1)).trim().replaceAll("\\s+", ""); String str2 = ((String) ht.get(a2)).trim().replaceAll("\\s+", ""); //logger.info("Lengths for sequences: " + str1.length() + " " + str2.length() ); container.addRowToTable(do_alignment(align_id++, a1, str1, a2, str2, m_align_type.getStringValue(), outputSpec, gap_open_penalty, gap_extend_penalty)); done++; } // check if the execution monitor was canceled if (done % 30 == 0) { exec.checkCanceled(); exec.setProgress(done / (double)(n_rows*n_rows), "Done " + done + " alignments"); } } } // once we are done, we close the container and return its table container.close(); BufferedDataTable out = container.getTable(); return new BufferedDataTable[]{out}; } protected BufferedDataTable[] execute_pairs(final BufferedDataTable[] inData, final ExecutionContext exec, double gap_open_penalty, double gap_extend_penalty, int n_rows, int accsn_col, int sequence_col) throws Exception { int sequence2_col = inData[0].getDataTableSpec().findColumnIndex(m_seq2_col.getStringValue()); if (sequence2_col < 0) { throw new Exception("Cannot locate sequence2 column: have you configured the node correctly?"); } if (sequence_col == sequence2_col) { throw new Exception("Cannot use the same column for both sequences!"); } // in this case we can just append the results to the columns, no need to output everything we did for the pairwise case DataTableSpec alignment_spec = AlignmentReporter.getTableSpec(m_wanted.getStringArrayValue()); BufferedDataContainer container = exec.createDataContainer(alignment_spec, false, 0); int done = 0; CloseableRowIterator it = inData[0].iterator(); int align_no = 1; while (it.hasNext()) { DataRow r = it.next(); DataCell c1 = r.getCell(sequence_col); DataCell c2 = r.getCell(sequence2_col); if (c1.isMissing() || c2.isMissing()) continue; String[] vec = null; if (c2.getType().isCollectionType()) { if (c2 instanceof ListCell) { ListCell l2 = (ListCell) c2; if (l2.size() < 1) continue; vec = new String[l2.size()]; int cnt = 0; for (DataCell c : l2) { vec[cnt++] = c.toString(); } l2 = null; } else if (c2 instanceof SetCell) { SetCell s2 = (SetCell) c2; if (s2.size()< 1) continue; vec = new String[s2.size()]; int cnt = 0; for (DataCell c : s2) { vec[cnt++] = c.toString(); } } else { throw new Exception("Unknown collection cell: "+c2.getType().toString()+ ", aborting!"); } } else { vec = new String[] { c2.toString() }; } String str1 = c1.toString().trim().replaceAll("\\s+", ""); c1 = null; c2 = null; for (String str2 : vec) { //logger.info("Lengths for sequences: " + str1.length() + " " + str2.length() ); str2 = str2.trim().replaceAll("\\s+", ""); container.addRowToTable(do_alignment(align_no, "s1", str1, "s2", str2, m_align_type.getStringValue(), alignment_spec, gap_open_penalty, gap_extend_penalty)); align_no++; } vec = null; done++; // check if the execution monitor was canceled if (done % 30 == 0) { exec.checkCanceled(); exec.setProgress(done / (double)n_rows, "Done " + done + " rows"); } r = null; } // once we are done, we close the container and return its table container.close(); BufferedDataTable out = container.getTable(); return new BufferedDataTable[]{out}; } private DataRow do_alignment(int align_id, String a1, String str1, String a2, String str2, String align_type, DataTableSpec spec, double gap_open_penalty, double gap_extend_penalty) throws FileNotFoundException, IOException, InvalidSequenceException, IncompatibleScoringSchemeException, InvalidScoringMatrixException { AlignmentReporter ar; if (align_type.indexOf("Local - JAligner") >= 0) { Sequence seq1 = new Sequence(str1); Sequence seq2 = new Sequence(str2); Alignment a = SmithWatermanGotoh.align(seq1, seq2, m_jalign_matrix, (float) gap_open_penalty, (float) gap_extend_penalty); ar = new AlignmentReporter(a, a1, str1, a2, str2); } else { StringReader sr1 = new StringReader(str1); StringReader sr2 = new StringReader(str2); PairwiseAlignmentAlgorithm algorithm; if (align_type.startsWith("Local")) { algorithm = align_type.endsWith("Waterman") ? new SmithWaterman() : new CrochemoreLandauZivUkelsonLocalAlignment(); } else { algorithm = align_type.endsWith("Wunsch") ? new NeedlemanWunsch() : new CrochemoreLandauZivUkelsonGlobalAlignment(); } // TODO: neobio does not support gap open/extend cost model... fix this? algorithm.setScoringScheme (m_neobio_matrix); algorithm.loadSequences (sr1, sr2); // now compute the alignment PairwiseAlignment alignment = algorithm.getPairwiseAlignment(); sr1.close(); sr2.close(); ar = new AlignmentReporter(alignment, a1, str1, a2, str2); } return ar.getRow(align_id, spec); } /** * {@inheritDoc} */ @Override protected void reset() { // TODO Code executed on reset. // Models build during execute are cleared here. // Also data handled in load/saveInternals will be erased here. } /** * {@inheritDoc} */ @Override protected DataTableSpec[] configure(final DataTableSpec[] inSpecs) throws InvalidSettingsException { DataTableSpec ts = AlignmentReporter.getTableSpec(m_wanted.getStringArrayValue()); return new DataTableSpec[]{ts}; } /** * {@inheritDoc} */ @Override protected void saveSettingsTo(final NodeSettingsWO settings) { m_align_type.saveSettingsTo(settings); m_gap_penalty_open.saveSettingsTo(settings); m_gap_penalty_extend.saveSettingsTo(settings); m_accsn_col.saveSettingsTo(settings); m_seq_col.saveSettingsTo(settings); m_seq2_col.saveSettingsTo(settings); m_builtin_matrix.saveSettingsTo(settings); m_is_pairwise.saveSettingsTo(settings); m_wanted.saveSettingsTo(settings); } /** * {@inheritDoc} */ @Override protected void loadValidatedSettingsFrom(final NodeSettingsRO settings) throws InvalidSettingsException { m_align_type.loadSettingsFrom(settings); m_gap_penalty_open.loadSettingsFrom(settings); m_gap_penalty_extend.loadSettingsFrom(settings); m_accsn_col.loadSettingsFrom(settings); m_seq_col.loadSettingsFrom(settings); m_seq2_col.loadSettingsFrom(settings); m_builtin_matrix.loadSettingsFrom(settings); m_is_pairwise.loadSettingsFrom(settings); m_wanted.loadSettingsFrom(settings); } /** * {@inheritDoc} */ @Override protected void validateSettings(final NodeSettingsRO settings) throws InvalidSettingsException { m_align_type.validateSettings(settings); m_gap_penalty_open.validateSettings(settings); m_gap_penalty_extend.validateSettings(settings); m_accsn_col.validateSettings(settings); m_seq_col.validateSettings(settings); m_seq2_col.validateSettings(settings); m_builtin_matrix.validateSettings(settings); m_is_pairwise.validateSettings(settings); m_wanted.validateSettings(settings); } /** * {@inheritDoc} */ @Override protected void loadInternals(final File internDir, final ExecutionMonitor exec) throws IOException, CanceledExecutionException { } /** * {@inheritDoc} */ @Override protected void saveInternals(final File internDir, final ExecutionMonitor exec) throws IOException, CanceledExecutionException { } /** * Returns the list of available scoring matrices (shared between neobio & jaligner) * @return list of available matrix names eg. BLOSUM62, PAM250 etc. */ public static String[] getBuiltinScoringMatrices() { ArrayList<String> ret = new ArrayList<String>(); try { InputStream inputStream = SequenceAlignerNodePlugin.getJAlignerJARStream(); JarInputStream jis= new JarInputStream(inputStream); ZipEntry ze = null; while ((ze = jis.getNextEntry()) != null) { String name = ze.getName(); if (name.startsWith("jaligner/matrix/matrices/")) { ret.add(name.substring("jaligner/matrix/matrices/".length())); } } jis.close(); } catch (IOException e) { e.printStackTrace(); } return ret.toArray(new String[0]); } /** * Returns an input stream to the jAligner (NCBI) scoring matrix of the specified name (case sensitive) * eg. BLOSUM62 * * @param stringValue the name of the NCBI-compatible scoring matrix (in uppercase) * @return a handle to the data or null if it does not exist */ private String getMatrix(String stringValue) throws IOException { InputStream inputStream= SequenceAlignerNodePlugin.getJAlignerJARStream(); JarInputStream jis= new JarInputStream(inputStream); ZipEntry ze= null; while ((ze = jis.getNextEntry()) != null) { if (ze.getName().equals("jaligner/matrix/matrices/"+stringValue)) { BufferedReader br = new BufferedReader(new InputStreamReader(jis)); StringBuffer sb = new StringBuffer(4 * 1024); String line = null; while ((line = br.readLine()) != null) { sb.append(line); sb.append('\n'); } return sb.toString(); } } return null; } }